import numpy as np
import pandas as pd
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' and
# 3.8 removed the old spellings, so use whichever name this installation provides.
for _style in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if _style in plt.style.available:
        plt.style.use(_style)
        break
else:
    # Neither spelling is installed: fall through to the original call so the
    # usual Matplotlib error is raised instead of silently skipping the style.
    plt.style.use('seaborn-whitegrid')
del _style

# Global font sizes and text colour for every Matplotlib/seaborn figure below.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
    'text.color': 'k',
})
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")
In this article, we analyze the Heart Disease Dataset from the UCI Machine Learning Repository.

Picture Source: harvard.edu
The objective of this exercise is to develop a model that predicts whether heart disease is present or absent based on the remaining features.
# Location of the raw, space-delimited data file.
File = 'heart-disease/heart.dat'

# Column names in file order; the last column is the label.
Attributes = [
    'Age', 'Sex', 'Chest Pain Type', 'Resting Blood Pressure', 'Serum Cholestoral',
    'Fasting Blood Sugar', 'Resting Electrocardiographic Results', 'Maximum Heart Rate Achieved',
    'Exercise Induced Angina', 'Oldpeak', 'Slope', 'Number of Major Vessels', 'Thal', 'Heart Disease',
]

# Name of the label column and the display names of its two classes.
Target = 'Heart Disease'
Labels = ['Absent', 'Present']

# `df` keeps the values exactly as parsed (all numeric).
df = pd.DataFrame(data=np.genfromtxt(File, delimiter=' '), columns=Attributes)

# `Data` is the working copy that gets recoded for display and plotting.
Data = df.copy()

# Coded categorical features: cast to int first, then to str, so each code
# becomes a string such as '1' rather than '1.0'.
_categorical = [
    'Sex', 'Chest Pain Type', 'Fasting Blood Sugar', 'Resting Electrocardiographic Results',
    'Exercise Induced Angina', 'Slope', 'Number of Major Vessels', 'Thal',
]
Data = Data.astype(dict.fromkeys(_categorical, int)).astype(dict.fromkeys(_categorical, str))
del _categorical

# Shift the label down by one, cast to int, then map 0/1 onto the entries of Labels.
_shifted_label = (Data[Target] - 1).astype(int)
Data[Target] = _shifted_label.replace(dict(enumerate(Labels)))
del _shifted_label

Data['Age'] = Data['Age'].astype(int)
def Data_info(Inp, Only_NaN = False):
    """Summarise dtype and missing-value statistics for every column of a frame.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to describe.
    Only_NaN : bool, default False
        When true, keep only the columns that contain at least one missing value.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` (index named ``'Features'``) with the columns
        ``'Data Type'`` (dtype name as a string), ``'Number of NaN Values'``,
        ``'Size'`` (row count of ``Inp``) and ``'Percentage'`` (share of non-missing
        values, in percent, rounded to two decimals). Rows are grouped by dtype
        name; within a dtype the original column order is kept.
    """
    n_rows = Inp.shape[0]

    # Use dtype *names* from the start: raw dtype objects are not reliably
    # orderable (extension dtypes such as 'category' raise TypeError on sort).
    Out = Inp.dtypes.astype(str).to_frame(name='Data Type')
    # Positional assignment: both sides follow Inp's column order, so no
    # index-aligning join (which would re-sort the rows) is needed.
    Out['Number of NaN Values'] = Inp.isnull().sum().to_numpy()
    Out['Size'] = n_rows
    Out['Percentage'] = 100 - np.round(100 * (Out['Number of NaN Values'] / n_rows), 2)
    Out.index.name = 'Features'

    # Stable sort keeps the original column order inside each dtype group.
    Out = Out.sort_values(by='Data Type', kind='stable')

    if Only_NaN:
        Out = Out.loc[Out['Number of NaN Values'] > 0]
    return Out
# Maps: for each coded feature, the translation from its string code to a readable label.
Maps = {'Sex': {'0':'Female', '1':'Male'},
        'Chest Pain Type': {'1':'Typical Angina', '2':'Atypical Angina', '3': 'Non-Anginal Pain', '4':'Asymptomatic'},
        'Fasting Blood Sugar': {'0': 'False', '1': 'True'}, 'Exercise Induced Angina': {'0': 'No', '1': 'Yes'},
        'Slope': {'1': 'Upsloping', '2': 'Flat', '3': 'Downsloping'},
        'Thal': {'3': 'Normal', '6': 'Fixed Defect','7': 'Reversable Defect'}}

# Recode each mapped feature; codes without an entry in its map are left unchanged by replace().
for _feature, _code_labels in Maps.items():
    Data[_feature] = Data[_feature].replace(_code_labels)
del _feature, _code_labels
display(Data)

# One-row summary of the dataset dimensions, shown without the index column.
_summary = pd.DataFrame({'Number of Instances': [Data.shape[0]], 'Number of Attributes': [Data.shape[1]]})
_styler = _summary.style
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
# Styler.hide(axis='index') is the replacement. Fall back for older pandas.
if hasattr(_styler, 'hide'):
    _styler = _styler.hide(axis='index')
else:
    _styler = _styler.hide_index()
display(_styler)
del _summary, _styler
| | Age | Sex | Chest Pain Type | Resting Blood Pressure | Serum Cholestoral | Fasting Blood Sugar | Resting Electrocardiographic Results | Maximum Heart Rate Achieved | Exercise Induced Angina | Oldpeak | Slope | Number of Major Vessels | Thal | Heart Disease |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 70 | Male | Asymptomatic | 130.0 | 322.0 | False | 2 | 109.0 | No | 2.4 | Flat | 3 | Normal | Present |
| 1 | 67 | Female | Non-Anginal Pain | 115.0 | 564.0 | False | 2 | 160.0 | No | 1.6 | Flat | 0 | Reversable Defect | Absent |
| 2 | 57 | Male | Atypical Angina | 124.0 | 261.0 | False | 0 | 141.0 | No | 0.3 | Upsloping | 0 | Reversable Defect | Present |
| 3 | 64 | Male | Asymptomatic | 128.0 | 263.0 | False | 0 | 105.0 | Yes | 0.2 | Flat | 1 | Reversable Defect | Absent |
| 4 | 74 | Female | Atypical Angina | 120.0 | 269.0 | False | 2 | 121.0 | Yes | 0.2 | Upsloping | 1 | Normal | Absent |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 265 | 52 | Male | Non-Anginal Pain | 172.0 | 199.0 | True | 0 | 162.0 | No | 0.5 | Upsloping | 0 | Reversable Defect | Absent |
| 266 | 44 | Male | Atypical Angina | 120.0 | 263.0 | False | 0 | 173.0 | No | 0.0 | Upsloping | 0 | Reversable Defect | Absent |
| 267 | 56 | Female | Atypical Angina | 140.0 | 294.0 | False | 2 | 153.0 | No | 1.3 | Flat | 0 | Normal | Absent |
| 268 | 57 | Male | Asymptomatic | 140.0 | 192.0 | False | 0 | 148.0 | No | 0.4 | Flat | 0 | Fixed Defect | Absent |
| 269 | 67 | Male | Asymptomatic | 160.0 | 286.0 | False | 2 | 108.0 | Yes | 1.5 | Flat | 3 | Normal | Present |
270 rows × 14 columns
| Number of Instances | Number of Attributes |
|---|---|
| 270 | 14 |
def Data_Plot(Inp, W = False):
    """Draw a bar chart of per-column completeness and return the underlying table.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to describe.
    W : int or False, default False
        Figure width passed to the layout; any falsy value leaves the width unset.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``Inp`` with the columns ``'Features'``,
        ``'Data Type'``, ``'Number of NaN Values'``, ``'Size'`` and
        ``'Percentage'`` (share of non-missing values, in percent, rounded to
        two decimals), in the column order of ``Inp``.
    """
    n_rows = Inp.shape[0]

    # Build the summary directly from Inp; both pieces follow Inp's column
    # order, so positional assignment is safe and keeps that order.
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    data_info['Number of NaN Values'] = Inp.isnull().sum().to_numpy()
    data_info['Size'] = n_rows
    data_info['Percentage'] = 100 - np.round(100 * (data_info['Number of NaN Values'] / n_rows), 2)
    # rename_axis first so the column is called 'Features' even when
    # Inp.columns carries its own name.
    data_info = data_info.rename_axis('Features').reset_index(drop = False)
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type', text = 'Data Type',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                         bordercolor="DarkGray", borderwidth=1))
    fig.update_traces(texttemplate= 6*' ' + '%{label}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if W:
        fig.update_layout(width = W)
    # The bold tag is now closed, and the y axis is labelled for what it
    # actually shows (the 'Percentage' column), not 'Frequency'.
    fig.update_layout(title={'text': '<b>' + 'Dataset Overview' + '</b>', 'x':0.5,
                             'y': 0.92, 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Percentage of Non-Missing Values')
    fig.show()
    return data_info
# Show the dataset-overview bar chart for the recoded frame; the returned summary table is discarded.
_ = Data_Plot(Data)
def DistPlot1(Feat, Target = Target, nbins = 20, Colors = ['LightSalmon', 'LightBlue'], LC = 'Black',
              yLim = [0, 80], H = 450, titleY = 0.92, Inp = Data):
    """Show a histogram of one feature, coloured by the target, with a marginal box plot.

    Parameters
    ----------
    Feat : str
        Column of ``Inp`` placed on the x axis.
    Target : str
        Column of ``Inp`` used to colour the bars.
    nbins : int
        Passed to ``px.histogram`` as ``nbins``.
    Colors : list of str
        Discrete colour sequence for the target classes (read only, never mutated).
    LC : str
        Outline colour of the bars.
    yLim : list
        ``[low, high]`` range applied to the main y axis (read only, never mutated).
    H : int
        Figure height; the width is fixed at 980.
    titleY : float
        Vertical position of the title.
    Inp : pandas.DataFrame
        Frame to plot.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # hover_data comes from Inp (not the module-level Data), so the function
    # also works for frames whose columns differ from Data's.
    fig = px.histogram(Inp, x = Feat, nbins=nbins, color= Target, marginal= 'box',
                       color_discrete_sequence= Colors, hover_data=Inp.columns)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    # The histogram shows the distribution of Feat split by Target, so the
    # feature name comes first (same wording order as DispPlot2's title).
    Name = '%s Distribution by %s' % (Feat, Target)
    fig.update_layout(legend_orientation='v', plot_bgcolor= 'white', height= H, width= 980,
                      title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    fig.update_traces(marker_line_color= LC, marker_line_width=0.5, opacity=1)
    # Only the main y axis gets the fixed range; the marginal plot's axis is left alone.
    fig['layout']['yaxis'].update(range=yLim)
    fig.show()
def DispPlot2(Feat, ColorFeat, xLim, yLim, Target = Target, nbins = 5, titleY = 0.90, LC = 'Black', H2 = 320, W = 980,
              Colors1 = ['Pink', 'BlueViolet'], Colors2 = ['OrangeRed', 'LimeGreen'], Inp = Data):
    """Show two figures relating a feature, a colouring feature and the target.

    The top figure holds two side-by-side stacked histograms of ``Feat`` coloured
    by ``ColorFeat``, one per entry of the module-level ``Labels[0]`` and
    ``Labels[1]``. The bottom figure is a box plot of ``Feat`` (x) against
    ``ColorFeat`` (y), coloured by ``Target``.

    Parameters
    ----------
    Feat : str
        Column placed on the x axis of both figures.
    ColorFeat : str
        Column used to colour the histograms and as the y axis of the box plot.
    xLim, yLim : list
        ``[low, high]`` axis ranges. ``xLim`` applies to both figures, ``yLim``
        only to the histograms.
    Target : str
        Column whose values are compared with ``Labels`` to split the data.
    nbins : int
        Passed to ``px.histogram`` as ``nbins``.
    titleY : float
        Vertical position of the top figure's title.
    LC : str
        Outline colour of the histogram bars.
    H2 : int
        Height of the bottom figure.
    W : int
        Width of both figures.
    Colors1 : list of str or None
        Colour sequence for the histograms; ``None`` uses Plotly's default.
    Colors2 : list of str
        Colour sequence for the box plot.
    Inp : pandas.DataFrame
        Frame to plot.

    Returns
    -------
    None
        Both figures are displayed with ``fig.show()``.
    """
    # Axis styling shared by the top and bottom figures.
    x_axis_style = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                        zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                        showgrid=False, gridwidth=1, gridcolor='Lightgray', range=xLim)
    y_axis_style = dict(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                        zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                        showgrid=True, gridwidth=1, gridcolor='Lightgray')

    # Top Figure
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True,
                        subplot_titles=('%s: <b>%s</b>' % (Target, Labels[0]),
                                        '%s: <b>%s</b>' % (Target, Labels[1])))
    # Only pass a colour sequence when one was given, so None falls back to Plotly's default.
    hist_kwargs = {} if Colors1 is None else {'color_discrete_sequence': Colors1}
    for col, label in enumerate(Labels[:2], start=1):
        # Sort by ColorFeat so both panels list the colour groups in the same order.
        subset = Inp.loc[Inp[Target] == label].sort_values(by = ColorFeat)
        panel = px.histogram(subset, x= Feat, color = ColorFeat, nbins = nbins, **hist_kwargs)
        for trace in panel.data:
            fig.add_trace(trace, row=1, col=col)

    fig.update_traces(marker_line_color= LC, marker_line_width=1, opacity=1)
    fig.update_yaxes(title_text ='Frequency', row=1, col=1)
    # Both panels carry the same colour groups; hide the second panel's legend entries.
    fig.update_traces(showlegend = False, row=1, col=2)
    fig.update_xaxes(**x_axis_style)
    fig.update_yaxes(range=yLim, **y_axis_style)
    fig.update_layout(plot_bgcolor= 'white', barmode='stack', width = W)
    Name = '%s Distribution by %s and %s' % (Feat, ColorFeat, Target)
    fig.update_layout(title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY,
                             'xanchor': 'center', 'yanchor': 'top'})
    fig.show()

    # Bottom Figure
    # Plot from Inp (not the module-level Data) so a caller-supplied frame is
    # used consistently in both figures.
    fig = px.box(Inp, x= Feat, y = ColorFeat, color = Target,
                 color_discrete_sequence= Colors2, hover_data=Inp.columns)
    fig.update_traces(quartilemethod='linear')
    fig.update_xaxes(**x_axis_style)
    fig.update_yaxes(**y_axis_style)
    fig.update_layout(plot_bgcolor= 'white', barmode='stack', height= H2, width = W)
    fig.show()
def Correlation_Plot (Inp, Fig_Size = 12, annot_kws = 11):
    """Draw a lower-triangle correlation heatmap and return the correlation matrix.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated.
    Fig_Size : float, default 12
        Side length of the square figure.
    annot_kws : int, default 11
        Font size of the cell annotations.

    Returns
    -------
    pandas.DataFrame
        Pairwise correlation matrix rounded to two decimals.
    """
    # Restrict to numeric/bool columns explicitly: since pandas 2.0, corr()
    # raises on non-numeric columns instead of silently dropping them.
    Correlation_Matrix = Inp.select_dtypes(include=['number', 'bool']).corr().round(2)

    # Hide the strict upper triangle (k=1 keeps the diagonal visible).
    mask = np.triu(np.ones(Correlation_Matrix.shape, dtype=bool), k=1)

    fig, ax = plt.subplots(figsize=(Fig_Size, Fig_Size))
    # NOTE(review): the colour scale is pinned to [0, 1], so every negative
    # correlation is drawn in the lightest colour; confirm this is intended.
    sns.heatmap(Correlation_Matrix, ax=ax, mask=mask, annot=True, square=True,
                cmap =sns.color_palette("Greens", n_colors=10), linewidths = 0.2, vmin=0, vmax=1,
                cbar_kws={'label': 'Correlation', "aspect":30, "shrink": .4}, annot_kws={"size": annot_kws})
    return Correlation_Matrix
def Feature_Correlation(Inp, Target = Target, annot_kws = 11, FS = (17, 16), rep_long_labels = False):
    """Draw a one-row heatmap of every feature's correlation with the target.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame whose numeric and boolean columns are correlated.
    Target : str
        Column whose correlations with the other columns are shown.
    annot_kws : int, default 11
        Font size of the cell annotations.
    FS : tuple, default (17, 16)
        Figure size passed to ``plt.subplots``.
    rep_long_labels : bool, default False
        When true, spaces in the x tick labels are replaced by line breaks
        (keeping "of" on the preceding line).

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``Target`` is not among the numeric/boolean columns of ``Inp``.
    """
    # Restrict to numeric/bool columns explicitly: since pandas 2.0, corr()
    # raises on non-numeric columns instead of silently dropping them.
    corr = Inp.select_dtypes(include=['number', 'bool']).corr().round(2)
    # Fail before any figure is created, so an invalid target leaves no empty canvas behind.
    if Target not in corr.index:
        raise KeyError('%r is not a numeric column of the input frame' % (Target,))

    # Keep only the target's row, drop its self-correlation, and order the
    # remaining features by increasing correlation.
    Temp = corr.loc[[Target]].drop(columns = Target).sort_values(by = Target, axis = 1)

    Fig, ax = plt.subplots(figsize = FS)
    # NOTE(review): the colour scale is pinned to [0, 1], so every negative
    # correlation is drawn in the lightest colour; confirm this is intended.
    _ = sns.heatmap(Temp, ax=ax, annot=True, square=True,
                    cmap =sns.color_palette("YlGn", n_colors=Temp.shape[1]),
                    linewidths = 0.8, vmin=0, vmax=1,
                    annot_kws={"size": annot_kws},
                    cbar_kws={'label': 'Correlation with %s' % Target,
                              "aspect":40, "shrink": .4, "orientation": "horizontal"})
    if rep_long_labels:
        labels = [tick.get_text().replace(' ','\n').replace('\nof\n',' of\n')
                  for tick in ax.get_xticklabels()]
        _ = ax.set_xticklabels(labels)
    # The single row is identified by the colour-bar label, so its tick label is blanked.
    _ = ax.set_yticklabels('')
# Correlation heatmap of the all-numeric frame `df` with Fig_Size = 10; the returned matrix is discarded.
_ = Correlation_Plot (df, 10)
# Feature_Correlation(df, annot_kws = 11, FS = (12, 16), rep_long_labels = True)
# Age, coloured by Sex, split by heart-disease label.
DispPlot2(Feat = 'Age', ColorFeat = 'Sex', xLim = [10, 90], yLim = [0, 60])
# Resting blood pressure, coloured by fasting blood sugar, split by heart-disease label.
DispPlot2(Feat = 'Resting Blood Pressure', ColorFeat = 'Fasting Blood Sugar', xLim = [80, 200], yLim = [0, 40], nbins = 20)
# Serum cholesterol, coloured by fasting blood sugar, split by heart-disease label.
DispPlot2(Feat = 'Serum Cholestoral', ColorFeat = 'Fasting Blood Sugar', xLim = [0, 700], yLim = [0, 70], nbins = 10)
In this section, we demonstrate the relationship between the maximum heart rate achieved and heart disease.
# Maximum heart rate, coloured by chest pain type, split by heart-disease label.
# Four colours are supplied because Maps defines four chest-pain categories; H2 = 450 makes the box plot taller.
DispPlot2(Feat = 'Maximum Heart Rate Achieved', ColorFeat = 'Chest Pain Type',
Colors1 = ['Snow','MistyRose', 'Tomato', 'DarkRed'],
xLim = [80, 230], yLim = [0, 40], nbins = 20, H2 = 450)
Detrano, R., Janosi, A., Steinbrunn, W., Pfisterer, M., Schmid, J.J., Sandhu, S., Guppy, K.H., Lee, S. and Froelicher, V., 1989. International application of a new probability algorithm for the diagnosis of coronary artery disease. The American journal of cardiology, 64(5), pp.304-310.
Aha, D. and Kibler, D., 1988. Instance-based prediction of heart-disease presence with the Cleveland database. University of California, 3(1), pp.3-2.
Gennari, J.H., Langley, P. and Fisher, D., 1989. Models of incremental concept formation. Artificial intelligence, 40(1-3), pp.11-61.